#   Libaut
#       outcome regressions: simple, leave-pair-out
#       Figure 7
#   Juraj Medzihorsky
#   2021-08-12

library(mgcv)
library(parallel)
options(mc.cores=3)
options(stringsAsFactors=F)
source('helpers.R')

#   sessionInfo()
##  R version 3.6.3 (2020-02-29)
##  Platform: x86_64-pc-linux-gnu (64-bit)
##  Running under: Ubuntu 16.04.7 LTS
##
##  Matrix products: default
##  BLAS/LAPACK: /opt/OpenBLAS/lib/libopenblas_haswellp-r0.3.9.so
##
##  locale:
##   [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8
##   [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8    LC_PAPER=en_GB.UTF-8       LC_NAME=C
##   [9] LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
##  attached base packages:
##  [1] parallel  stats     graphics  grDevices utils     datasets  methods   base
##
##  other attached packages:
##  [1] mgcv_1.8-28    nlme_3.1-152   nvimcom_0.9-28 colorout_1.2-0
##
##  loaded via a namespace (and not attached):
##  [1] compiler_3.6.3  Matrix_1.3-4    tools_3.6.3     splines_3.6.3   grid_3.6.3      lattice_0.20-44

#------------------------------------------------------------------------------

code_dir <- getwd()
data_dir <- gsub('code$', 'data', code_dir)
figs_dir <- gsub('code$', 'figures', code_dir)
tabs_dir <- gsub('code$', 'tables', code_dir)

setwd(data_dir)
dat <- readRDS('outcome_simple_20210810.rds')
load('outcome_regressions_simple_lpo_20210812.rdata')
setwd(code_dir)

#------------------------------------------------------------------------------
#   data prep

dat$lag1_e_migdpgro[dat$dem_ep_id%in%'HRV_1992_2004'] <- NA
dat$reg_fac <- as.factor(dat$e_regionpol_6C)

#   labels for regions
reglabs <- c('Eastern Europe\n& Central Asia',
             'Latin America\n& the Caribbean',
             'MENA, ISR, TUR',
             'Sub-Saharan\nAfrica',
             'Western Europe,\nNorth America,\nCYP, AUS, NZ',
             'Asia\n& Pacific')

#   labels for continuous Xs
clabs <-
    c('ln GDPpc', 'GDPpc growth', 'ln Population in thousands',
      'Power distributed by social group', 'Equal distribution of resources',
      'Presidentialism', 'CSO participatory environment', 'Electoral
      democracy', 'Exclusive regional EDI', 'Year')

#------------------------------------------------------------------------------
#   summarize the CVs

cv_auc_d <- sapply(out_d, function(x) mean(x$auc))
cv_auc_c <- sapply(out_c, function(x) mean(x$auc))

cv_cc_d <- sapply(out_d, function(x) mean(x$cc))
cv_cc_c <- sapply(out_c, function(x) mean(x$cc))

#------------------------------------------------------------------------------
#   models

#   formulas
fy0 <- 'epy ~ 1'
#
dummies <- c('reg_fac', 'e_miinteco', 'e_miinterc')
#
conties <- 
    c('e_migdppcln', 'e_migdpgro', 'logpop', 'v2pepwrsoc', 'v2xeg_eqdr',
      'v2xnp_pres', 'v2csprtcpt', 'v2x_polyarchy', 'excl_region_edi', 'year')  
#
dummies <- c(dummies[1], paste0('lag1_', dummies[2:3]))
conties <- c(paste0('lag1_', conties[1:9]), conties[10])
fd <- sapply(paste0(fy0, ' + ', dummies), as.formula)
fc <- sapply(paste0(fy0, ' + ', 's(', conties, ', bs="gp")'), as.formula)
fc[[9]] <- as.formula(paste0(fy0, ' + ', 's(', conties[9], ', bs="re")'))


#   fitted (predicted) values with thier ses
getfitted <- 
    function(x) 
    {
        d <- x$model
        f <- predict(x, newdata=d, se.fit=T, type='response')
        d$fit <- f$fit
        d$se.fit <- f$se.fit
        return(d)
    }


#   fit the models to the available subsets
ld <- mclapply(fd, glm, data=dat, family=binomial('logit'))
lc <- mclapply(fc, gam, data=dat, family=binomial('logit'))

#   get the fitted values
pd <- mclapply(ld, getfitted) 
pc <- mclapply(lc, getfitted) 
#   sizes of the available subsets
nd <- sapply(pd, nrow)
nc <- sapply(pc, nrow)

#   in-sample AUC-ROC
aucd <- lapply(pd, function(x) getROC_AUC(x$fit, x$epy))
aucc <- lapply(pc, function(x) getROC_AUC(x$fit, x$epy))
#
auc_d <- sapply(aucd, function(x) x$auc)
auc_c <- sapply(aucc, function(x) x$auc)

#-------------------------------------------------------------------------------
#   predicted values for the plot

#   newdata for predicted values under discrete X
ddat <- list(data.frame(reg_fac=as.factor(1:6)),
             data.frame(lag1_e_miinteco=c(0,1)),
             data.frame(lag1_e_miinterc=c(0,1)))
#   newdata for predicted values under continuous X
newdat <- apply(dat[,conties], 2, function(x) seq(min(x, na.rm=T), max(x, na.rm=T), length.out=101))
newdat <- as.data.frame(newdat)

#   compute the predicted values
fitd <- lapply(1:3, function(i) predict(ld[[i]], newdata=ddat[[i]], se.fit=T, type='response')) 
fitc <- lapply(lc, function(i) predict(i, newdata=newdat, se.fit=T, type='response'))

#-------------------------------------------------------------------------------
#   the plot (Figure 7)

setwd(figs_dir)
pdf('Figure_7.pdf', 7, 8)
par(mfrow=c(3,4), mar=c(4,2,0.5,0.5), oma=c(0,1,0,0), las=1, cex=0.5)
yl <- c(0,1)+c(-1,+1)*gr(-7)
for (i in 1:10) {
    plot.new()
    plot.window(xlim=range(newdat[,conties[i]]), ylim=yl)
    rug(dat[dat$epy%in%1,conties[i]], side=3, col=natcol('blue'))
    rug(dat[dat$epy%in%0,conties[i]], side=1, col=natcol('red'))
    box(lwd=gr(-1), col=grey(0.7))
    abline(h=c(0,1), col=grey(0.7), lty=2, lwd=gr(-2))
    av <- fitc[[i]]$fit
    lo <- pmax(0, av - 2 * fitc[[i]]$se.fit)
    hi <- pmin(1, av + 2 * fitc[[i]]$se.fit)
    tx <- newdat[,conties[i]]
    polygon(c(tx,rev(tx)), c(lo,rev(hi)), col=grey(0.9), border=grey(0.9), lwd=1e-2)
    lines(tx, av, col=grey(0.4))
    a1 <- axTicks(1)
    a2 <- seq(0, 1, by=0.2)
    pu <- par('usr')
    xm <- mean(pu[1:2])
    if (substr(clabs[i],1,2)=='ln') {
        if (clabs[i]=='ln GDPpc') {
            al1 <- c('500', '5000', '50000')  
            a1 <- log(as.numeric(al1))
            al1 <- c('500', '5,000', '50,000')  
        } else if (clabs[i]=='ln Population in thousands') {
            al1 <- c('50', '500', '5000', '50000', '500000')  
            a1 <- log(as.numeric(al1))
            al1 <- c(expression(5%*%10^1), expression(5%*%10^2), 
                     expression(5%*%10^3), expression(5%*%10^4), expression(5%*%10^5))
        }
        axis(1, a1, rep('', length(a1)), lwd=0, lwd.ticks=gr(-1), tck=-gr(1)*1e-2, col=grey(0.7))
        axis(1, a1, al1, lwd=0, line=-gr(-1))
    } else {
        axis(1, a1, rep('', length(a1)), lwd=0, lwd.ticks=gr(-1), tck=-gr(1)*1e-2, col=grey(0.7))
        axis(1, lwd=0, line=-gr(-1))
    }
    axis(1, xm, clabs[i], lwd=0, line=gr(-1))
    if (i %in% c(1,5,9)) {
        axis(2, a2, rep('', length(a2)), lwd=0, lwd.ticks=gr(-1), tck=-gr(1)*1e-2, col=grey(0.7))
        axis(2, lwd=0, line=-gr(-1))
    }
    axis(2, a2, rep('', length(a2)), lwd=0, lwd.ticks=gr(-1), tck=+gr(1)*1e-2, col=grey(0.7))
    axis(4, a2, rep('', length(a2)), lwd=0, lwd.ticks=gr(-1), tck=+gr(1)*1e-2, col=grey(0.7))
    ny1 <- sum(pc[[i]]$epy%in%1)
    ny0 <- format(sum(pc[[i]]$epy%in%0), big.mark=',')
    em <- strheight('M')
    text(xm, 1+em*(1-gr(-3)), if(i==1) { bquote(N['Y=1']==.(ny1)) } else { bquote(.(ny1))}, col=natcol('blue'))
    text(xm, 0-em*(1-gr(-3)), if(i==1) { bquote(N['Y=0']==.(ny0)) } else { bquote(.(ny0))}, col=natcol('red'))
    xaux <- xm
    text(xaux, 1 - strheight('M', cex=gr(1)), paste0('LPO-AUC = ', round(cv_auc_c[i], 2)), pos=1, offset=0)
}
    plot.new()
    plot.window(xlim=c(0,4), ylim=yl)
    lines(c(gr(-4),2-gr(-4)), c(0,0), col=grey(0.7), lty=2, lwd=gr(-2))
    lines(c(gr(-4),2-gr(-4)), c(1,1), col=grey(0.7), lty=2, lwd=gr(-2))
    lines(c(2+gr(-4),4-gr(-4)), c(0,0), col=grey(0.7), lty=2, lwd=gr(-2))
    lines(c(2+gr(-4),4-gr(-4)), c(1,1), col=grey(0.7), lty=2, lwd=gr(-2))
    rect(gr(-4), pu[3], 2-gr(-4), pu[4], lwd=gr(-1), border=grey(0.7))
    rect(2+gr(-4), pu[3], 4-gr(-4), pu[4], lwd=gr(-1), border=grey(0.7))
    axis(1, c(1,3), c('International', 'Domestic'), lwd=0, line=+gr(-1))
    axis(1, 2, 'Armed Conflict', lwd=0, line=+gr(+1))
    td <- dist(par('usr')[1:2])*gr(1)*1e-2
    invisible(sapply(a2, function(j) lines(0+gr(-4)+c(0,td), rep(j, 2), col=grey(0.7), lwd=gr(-1))))
    invisible(sapply(a2, function(j) lines(2+gr(-4)+c(0,td), rep(j, 2), col=grey(0.7), lwd=gr(-1))))
    invisible(sapply(a2, function(j) lines(2-gr(-4)-c(0,td), rep(j, 2), col=grey(0.7), lwd=gr(-1))))
    invisible(sapply(a2, function(j) lines(4-gr(-4)-c(0,td), rep(j, 2), col=grey(0.7), lwd=gr(-1))))
    text(1, 1 - strheight('M', cex=gr(1)), paste0('LPO-AUC =\n', round(cv_auc_d[2], 2)), pos=1, offset=0)
    text(3, 1 - strheight('M', cex=gr(1)), paste0('LPO-AUC =\n', round(cv_auc_d[3], 2)), pos=1, offset=0)
    text(1, 1, sum(pd[[2]]$epy%in%1), col=natcol('blue'), pos=3, offset=gr(-3))
    text(1, 0, format(sum(pd[[2]]$epy%in%0), big.mark=','), col=natcol('red'),  pos=1, offset=gr(-3))
    text(3, 1, sum(pd[[3]]$epy%in%1), col=natcol('blue'), pos=3, offset=gr(-3))
    text(3, 0, format(sum(pd[[3]]$epy%in%0), big.mark=','), col=natcol('red'),  pos=1, offset=gr(-3))
    lines(c(1,1)-gr(-3),
          c(fitd[[2]]$fit[1]-1.96*fitd[[2]]$se.fit[1],
            fitd[[2]]$fit[1]+1.96*fitd[[2]]$se.fit[1]),
          col=grey(0.4), lend='butt')
    lines(c(1,1)+gr(-3),
          c(fitd[[2]]$fit[2]-1.96*fitd[[2]]$se.fit[2],
            fitd[[2]]$fit[2]+1.96*fitd[[2]]$se.fit[2]),
          col=grey(0.4), lend='butt')
    lines(c(3,3)-gr(-3),
          c(fitd[[3]]$fit[1]-1.96*fitd[[3]]$se.fit[1],
            fitd[[3]]$fit[1]+1.96*fitd[[3]]$se.fit[1]),
          col=grey(0.4), lend='butt')
    lines(c(3,3)+gr(-3),
          c(fitd[[3]]$fit[2]-1.96*fitd[[3]]$se.fit[2],
            fitd[[3]]$fit[2]+1.96*fitd[[3]]$se.fit[2]),
          col=grey(0.4), lend='butt')
    points(1+c(-1,+1)*gr(-3), fitd[[2]]$fit, pch=16, col=grey(0.4))
    points(3+c(-1,+1)*gr(-3), fitd[[3]]$fit, pch=16, col=grey(0.4))
    a1 <- c(1+c(-1,+1)*gr(-3), 3+c(-1,+1)*gr(-3))
    axis(1, a1, rep('', length(a1)), lwd=0, lwd.ticks=gr(-1), tck=-gr(1)*1e-2, col=grey(0.7))
    axis(1, a1, c('0', '1', '0', '1'), lwd=0, line=-gr(-1))
#
    plot.new()
    plot.window(xlim=c(0,7)+c(-1,1)*gr(-2), ylim=yl)
    abline(h=c(0,1), col=grey(0.7), lty=2, lwd=gr(-2))
    box(lwd=gr(-1), col=grey(0.7))
    for (i in 1:6) {
        lines(rep(i,2), 
              c(max(0, fitd[[1]]$fit[i]-1.96*fitd[[1]]$se.fit[i]),    
                min(1, fitd[[1]]$fit[i]+1.96*fitd[[1]]$se.fit[i])),
              col=grey(0.4), lend='butt')
    }
    pu <- par('usr')
    xm <- mean(pu[1:2])
    a1 <- 1:6 
    points(a1, fitd[[1]]$fit, pch=16, col=grey(0.4))
    text(xm, 1, sum(pd[[1]]$epy%in%1), col=natcol('blue'), pos=3, offset=gr(-3))
    text(xm, 0, format(sum(pd[[1]]$epy%in%0), big.mark=','), col=natcol('red'),  pos=1, offset=gr(-3))
    text(dist(pu[1:2])/5,
         1 - strheight('M', cex=gr(1)), paste0('LPO-AUC = ', round(cv_auc_d[1], 2)), pos=1, offset=0)
    axis(1, xm, 'Region ', lwd=0, line=gr(-1))
    axis(2, a2, rep('', length(a2)), lwd=0, lwd.ticks=gr(-1), tck=+gr(1)*1e-2, col=grey(0.7))
    axis(4, a2, rep('', length(a2)), lwd=0, lwd.ticks=gr(-1), tck=+gr(1)*1e-2, col=grey(0.7))
    text(1:6, fitd[[1]]$fit, reglabs, cex=gr(-1), pos=c(4,4,2,4,4,4), offset=gr(-2))
    mtext('Predicted probability', side=2, cex=gr(-1), outer=T, las=3)
    mtext('Predicted probability', side=2, cex=gr(-1), outer=T, las=3, at=1/6)
    mtext('Predicted probability', side=2, cex=gr(-1), outer=T, las=3, at=5/6)
dev.off()

#   SCRIPT END